# OTU table
otu_table = read_tsv("../data/otu-table.tsv", skip = 1)
otu_id = otu_table$`#OTU ID`
otu_table = data.frame(otu_table[, -1], check.names = FALSE, row.names = otu_id)

# Taxonomy table
tax = read_tsv("../data/taxonomy.tsv")
otu_id = tax$`Feature ID`
tax = data.frame(tax[, - c(1, 3)], row.names = otu_id)
tax = tax %>% separate(col = Taxon, 
                       into = c("Kingdom", "Phylum", "Class", "Order", 
                                "Family", "Genus", "Species"),
                       sep = ";")
for (i in 1:ncol(tax)) {
  tax[, i] = sapply(tax[, i], function(x) str_split(x, "__")[[1]][2])
}
tax = as.matrix(tax)
tax[tax == ""] = NA

# Tree
tree = read_tree("../data/tree.nwk")

# Meta data
meta_data = read_tsv("../data/metadata.tsv")
meta_data = meta_data %>%
  mutate_if(is.character, as.factor)
meta_data$sampleid = as.character(meta_data$sampleid)
meta_data$caregiver_stress_level = factor(meta_data$caregiver_stress_level,
                                          levels = c("Low",  "Medium",  "High"))
meta_data$depression_level = factor(meta_data$depression_level,
                                    levels = c("Low",  "Medium",  "High"))
meta_data$hostility_level = factor(meta_data$hostility_level,
                                   levels = c("Low",  "Medium",  "High"))
meta_data$das_level = factor(meta_data$das_level,
                             levels = c("Low",  "Medium",  "High"))
meta_data$metabolic_syndrome_level = factor(meta_data$metabolic_syndrome_level,
                                            levels = c("No or Mild", 
                                                       "Moderate", 
                                                       "Severe"))

OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$sampleid
TAX = tax_table(tax)
otu_data = phyloseq(OTU, TAX, META, tree)
pipeline = function(pseq, sample_id, adj_formula, group){
  feature_table = abundances(pseq)
  meta_data = meta(pseq)
  
  p_adj_method = "BH"; zero_cut = 0.90; lib_cut = 1000
  struc_zero = TRUE; neg_lb = FALSE; tol = 1e-5
  max_iter = 100; conserve = FALSE; alpha = 0.05
  per_num = 1000; global = FALSE; direct = TRUE
  dunnett = FALSE; pattern = NULL
  out = ANCOM_BC(feature_table, meta_data, sample_id, adj_formula, p_adj_method, 
                 zero_cut, lib_cut, struc_zero, neg_lb, group, tol, max_iter, 
                 conserve, alpha, per_num, global, direct, dunnett, pattern)
  
  if(is.null(out$res_direct)) {
    res = out$res
  } else {
    res = out$res_direct
  }
  res_beta = data.frame(res$beta * res$diff_abn, check.names = FALSE) %>% 
    rownames_to_column("taxon_id")
  res_se = data.frame(res$se * res$diff_abn, check.names = FALSE) %>% 
    rownames_to_column("taxon_id")
  colnames(res_se)[-1] = paste0(colnames(res_se)[-1], "SD")
  res_model = res_beta %>% 
    left_join(res_se, by = "taxon_id") %>%
    dplyr::select(-matches("Intercept"))
  
  # Heatmap
  df_fig = res_beta %>% 
    dplyr::select(-matches("Intercept"))
  level_ch = levels(as.factor(meta_data[, group]))
  col_label = outer(level_ch, level_ch, function(x, y) paste0(x, " - ", y))
  col_label = col_label[lower.tri(col_label)]
  colnames(df_fig) = c("taxon_id", col_label)
  df_fig_long = df_fig %>% 
    gather(key = "key", value = "value", -taxon_id)
  df_fig_long$taxon_id = factor(df_fig_long$taxon_id)
  
  p_heat = df_fig_long %>%
    ggplot(aes(x = key, y = taxon_id)) +
    scale_x_discrete(expand = c(0, 0)) + 
    labs(x = NULL, y = NULL, fill = "Log fold change") + 
    geom_tile(aes(fill = value)) + 
    scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
  
  # Relative abundance plot
  df_fig_long = data.frame(t(abundances(pseq)), check.names = FALSE) %>% 
    rownames_to_column(sample_id) %>%
    pivot_longer(-sample_id, names_to = "taxon", values_to = "value") %>%
    left_join(meta_data %>% 
                dplyr::select(sample_id, group))
  colnames(df_fig_long)[ncol(df_fig_long)] = "group"
  
  df_fig_wide = df_fig_long %>%
    filter(!is.na(group)) %>%
    group_by(group, taxon) %>%
    summarise(value = sum(value)) %>%
    pivot_wider(id_cols = "taxon",
                names_from = "group",
                values_from = "value") %>%
    mutate(total = rowSums(select_if(., is.numeric))) %>%
    mutate_if(is.numeric, function(x) 100 * x/sum(x)) %>%
    mutate(taxon = case_when(
      taxon == "Unknown" ~ "Others",
      total < 2 ~ "Others",
      TRUE ~ taxon)) %>%
    group_by(taxon) %>%
    summarise_all(sum)
  
  df_fig = df_fig_wide %>%
    dplyr::select(-total) %>%
    pivot_longer(-taxon, names_to = "group", values_to = "value")
  df_fig$taxon = forcats::fct_relevel(df_fig$taxon, "Others", after = Inf)
  
  p_rel = df_fig %>%
    ggplot(aes(x = group, y = value, fill = taxon)) +
    geom_col(position = position_stack(), color = "black") + 
    labs(x = NULL, y = "Relative abundance (%)") + 
    theme_bw() +
    scale_fill_discrete(name = NULL) +
    theme(legend.position = "right",
          axis.ticks.x = element_blank(),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          strip.background = element_rect(fill="white"))
  
  obj = list(out = res_model, p_heat = p_heat, p_rel = p_rel)
  return(obj)
}

1. Analyses at family level

# Aggregate taxa
family_data = aggregate_taxa(otu_data, "Family")
pseq = family_data
sample_id = "sampleid"

covariates = c("alcohol", "caregiver_stress_level", "depression_level", 
               "hostility_level", "das_level", "metabolic_syndrome_level")
labels = c("Alcohol Use", "Caregiver Stress Level", "Depression Level",
           "Hostility Level", "DAS Level", "Metabolic Syndrome Level")

for (i in seq_along(covariates)) {
  cat("\n \n \n")
  covariate = covariates[i]
  label = labels[i]
  adj_formula = covariate
  group = covariate
  obj = pipeline(pseq, sample_id, adj_formula, group)
  
  file_path = paste0("../outputs/ancombc_family_", covariate, ".csv")
  res = obj$out
  res[, -1] = signif(res[, -1], 3)
  write_csv(res, file_path)
  
  heat_path1 = paste0("../images/abundance/family_heatmap_", covariate, ".jpeg")
  heat_path2 = paste0("../images/abundance/family_heatmap_", covariate, ".pdf")
  p1 = obj$p_heat + 
    labs(title = label) +
    theme(plot.title = element_text(hjust = 0.5))
  print(p1)
  cat("\n \n \n")
  ggsave(filename = heat_path1, plot = p1, height = 5, 
         width = 6.25, units = 'in', dpi = 300)
  ggsave(filename = heat_path2, plot = p1, height = 5, width = 6.25)
  
  rel_path1 = paste0("../images/abundance/family_relative_", covariate, ".jpeg")
  rel_path2 = paste0("../images/abundance/family_relative_", covariate, ".pdf")
  p2 = obj$p_rel + 
    labs(title = label) +
    scale_fill_brewer(palette = "Paired") +
    theme(plot.title = element_text(hjust = 0.5))
  print(p2)
  cat("\n \n \n")
  ggsave(filename = rel_path1, plot = p2, height = 5, 
         width = 6.25, units = 'in', dpi = 300)
  ggsave(filename = rel_path2, plot = p2, height = 5, width = 6.25)
}

2. Analyses at genus level

# Aggregate taxa
genus_data = aggregate_taxa(otu_data, "Genus")
genus_data2 = merge_taxa2(genus_data, pattern = "\\Clostridium", name = "Clostridium")
pseq = genus_data2
sample_id = "sampleid"

covariates = c("alcohol", "caregiver_stress_level", "depression_level", 
               "hostility_level", "das_level", "metabolic_syndrome_level")
labels = c("Alcohol Use", "Caregiver Stress Level", "Depression Level",
           "Hostility Level", "DAS Level", "Metabolic Syndrome Level")

for (i in seq_along(covariates)) {
  cat("\n \n \n")
  covariate = covariates[i]
  label = labels[i]
  adj_formula = covariate
  group = covariate
  obj = pipeline(pseq, sample_id, adj_formula, group)
  
  file_path = paste0("../outputs/ancombc_genus_", covariate, ".csv")
  res = obj$out
  res[, -1] = signif(res[, -1], 3)
  write_csv(res, file_path)
  
  heat_path1 = paste0("../images/abundance/genus_heatmap_", covariate, ".jpeg")
  heat_path2 = paste0("../images/abundance/genus_heatmap_", covariate, ".pdf")
  p1 = obj$p_heat + 
    labs(title = label) +
    theme(plot.title = element_text(hjust = 0.5))
  print(p1)
  cat("\n \n \n")
  ggsave(filename = heat_path1, plot = p1, height = 5, 
         width = 6.25, units = 'in', dpi = 300)
  ggsave(filename = heat_path2, plot = p1, height = 5, width = 6.25)
  
  rel_path1 = paste0("../images/abundance/genus_relative_", covariate, ".jpeg")
  rel_path2 = paste0("../images/abundance/genus_relative_", covariate, ".pdf")
  p2 = obj$p_rel + 
    labs(title = label) +
    theme(plot.title = element_text(hjust = 0.5))
  print(p2)
  cat("\n \n \n")
  ggsave(filename = rel_path1, plot = p2, height = 5, 
         width = 6.25, units = 'in', dpi = 300)
  ggsave(filename = rel_path2, plot = p2, height = 5, width = 6.25)
}

Session information

sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS  10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] CVXR_1.0-8         nloptr_1.2.2.2     MASS_7.3-53        kableExtra_1.2.1  
 [5] RColorBrewer_1.1-2 knitr_1.30         qwraps2_0.5.1      magrittr_2.0.1    
 [9] compositions_2.0-0 vegan_2.5-6        lattice_0.20-41    permute_0.9-5     
[13] microbiome_1.11.0  phyloseq_1.33.0    forcats_0.5.1      stringr_1.4.0     
[17] dplyr_1.0.2        purrr_0.3.4        tidyr_1.1.2        tibble_3.0.3      
[21] ggplot2_3.3.2      tidyverse_1.3.0    openxlsx_4.2.2     readr_1.3.1       

loaded via a namespace (and not attached):
 [1] Rtsne_0.15          colorspace_1.4-1    ellipsis_0.3.1     
 [4] XVector_0.29.3      fs_1.5.0            rstudioapi_0.11    
 [7] farver_2.0.3        bit64_4.0.5         fansi_0.4.1        
[10] lubridate_1.7.9     xml2_1.3.2          codetools_0.2-16   
[13] splines_4.0.2       robustbase_0.93-6   ade4_1.7-15        
[16] jsonlite_1.7.1      broom_0.7.0         cluster_2.1.0      
[19] Rmpfr_0.8-1         dbplyr_1.4.4        compiler_4.0.2     
[22] httr_1.4.2          backports_1.1.10    assertthat_0.2.1   
[25] Matrix_1.2-18       cli_2.0.2           htmltools_0.5.0    
[28] tools_4.0.2         gmp_0.6-0           igraph_1.2.6       
[31] gtable_0.3.0        glue_1.4.2          reshape2_1.4.4     
[34] Rcpp_1.0.6          Biobase_2.49.1      cellranger_1.1.0   
[37] vctrs_0.3.4         Biostrings_2.57.2   rhdf5filters_1.1.2 
[40] multtest_2.45.0     ape_5.4-1           nlme_3.1-149       
[43] iterators_1.0.13    tensorA_0.36.1      xfun_0.17          
[46] rvest_0.3.6         lifecycle_0.2.0     DEoptimR_1.0-8     
[49] zlibbioc_1.35.0     scales_1.1.1        hms_0.5.3          
[52] parallel_4.0.2      biomformat_1.17.0   rhdf5_2.33.9       
[55] yaml_2.2.1          stringi_1.5.3       S4Vectors_0.27.13  
[58] foreach_1.5.1       BiocGenerics_0.35.4 zip_2.1.1          
[61] rlang_0.4.11        pkgconfig_2.0.3     evaluate_0.14      
[64] Rhdf5lib_1.11.3     labeling_0.3        bit_4.0.4          
[67] tidyselect_1.1.0    plyr_1.8.6          R6_2.4.1           
[70] IRanges_2.23.10     generics_0.0.2      DBI_1.1.0          
[73] pillar_1.4.6        haven_2.3.1         withr_2.3.0        
[76] mgcv_1.8-33         survival_3.2-3      bayesm_3.1-4       
[79] modelr_0.1.8        crayon_1.3.4        rmarkdown_2.3      
[82] grid_4.0.2          readxl_1.3.1        data.table_1.14.0  
[85] blob_1.2.1          reprex_0.3.0        digest_0.6.25      
[88] webshot_0.5.2       stats4_4.0.2        munsell_0.5.0      
[91] viridisLite_0.3.0